The Breast Cancer Wisconsin (Diagnostic) dataset is a renowned collection of data used extensively in machine learning and medical research. Originating from digitized images of fine needle aspirates (FNA) of breast masses, this dataset facilitates the analysis of cell nuclei characteristics to aid in the diagnosis of breast cancer. In this article, we delve into the attributes, statistics, and significance of this dataset.
A new classifier for breast cancer detection based on Naïve Bayesian
library(tidyverse)
library(e1071)
library(here)
library(ggcorrplot)
library(ggpubr)
library(janitor)
library(plotly)
# Strongly penalise scientific notation so numbers print in fixed notation.
options(scipen = 999)

# Read the Wisconsin breast cancer CSV, located relative to the project root.
breast_cancer_data <- read_csv(
  here("data", "wisconsin_breast_cancer_data.csv")
)
Let’s test the assumption that our features are independent of each other
Drop columns that we don’t need in the correlation matrix
# Keep only the feature columns: the identifier, the class label and the
# `...33` column are excluded from the correlation analysis.
breast_cancer_data_cor <- select(
  breast_cancer_data,
  -c(id, diagnosis, ...33)
)
Create a matrix from the dataframe
# cor() is fed a matrix below, so convert the selected columns first.
breast_cancer_matrix <- breast_cancer_data_cor %>%
  as.matrix()
Calculate the correlations using cor()
# Pairwise correlation matrix of all features (Pearson is cor()'s default).
breast_cancer_correlations <- cor(
  breast_cancer_matrix,
  method = "pearson"
)
Plot the correlations
# Lower-triangle correlation heatmap with the coefficient printed in each cell;
# a small label size keeps the many cells readable.
ggcorrplot(
  breast_cancer_correlations,
  type = "lower",
  lab = TRUE,
  lab_size = 1
)
Visualizing the correlation between two features
# Scatter plot of mean radius against mean perimeter, with a smoothed trend
# line and the correlation coefficient annotated by ggpubr::stat_cor().
ggplot(
  data = breast_cancer_data,
  mapping = aes(x = radius_mean, y = perimeter_mean)
) +
  geom_point() +
  geom_smooth() +
  stat_cor()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Fit a naive Bayes classifier that predicts `diagnosis` from every other
# column of the data frame.
# NOTE(review): because the formula uses `.`, the `id` and `...33` columns are
# also treated as predictors (both show up in the printed conditional
# probabilities) -- consider dropping them before fitting.
breast_cancer_mod <- naiveBayes(diagnosis ~ ., data = breast_cancer_data)

# Show the a-priori class probabilities and the per-feature conditional tables.
print(breast_cancer_mod)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## B M
## 0.6267606 0.3732394
##
## Conditional probabilities:
## id
## Y [,1] [,2]
## B 26618125 116895515
## M 36818050 137896550
##
## radius_mean
## Y [,1] [,2]
## B 12.15885 1.767710
## M 17.46283 3.203971
##
## texture_mean
## Y [,1] [,2]
## B 17.89615 3.985221
## M 21.60491 3.779470
##
## perimeter_mean
## Y [,1] [,2]
## B 78.16011 11.71493
## M 115.36538 21.85465
##
## area_mean
## Y [,1] [,2]
## B 463.5817 133.6395
## M 978.3764 367.9380
##
## smoothness_mean
## Y [,1] [,2]
## B 0.09258958 0.01329740
## M 0.10289849 0.01260824
##
## compactness_mean
## Y [,1] [,2]
## B 0.08018705 0.03374184
## M 0.14518778 0.05398750
##
## concavity_mean
## Y [,1] [,2]
## B 0.0461870 0.04343437
## M 0.1607747 0.07501933
##
## concave points_mean
## Y [,1] [,2]
## B 0.02578965 0.01587242
## M 0.08799000 0.03437391
##
## symmetry_mean
## Y [,1] [,2]
## B 0.1742295 0.02482803
## M 0.1929090 0.02763809
##
## fractal_dimension_mean
## Y [,1] [,2]
## B 0.06287871 0.006753448
## M 0.06268009 0.007573315
##
## radius_se
## Y [,1] [,2]
## B 0.2837969 0.1125986
## M 0.6090825 0.3450386
##
## texture_se
## Y [,1] [,2]
## B 1.219797 0.5899058
## M 1.210915 0.4831781
##
## perimeter_se
## Y [,1] [,2]
## B 1.998783 0.7717058
## M 4.323929 2.5685457
##
## area_se
## Y [,1] [,2]
## B 21.14072 8.85529
## M 72.67241 61.35527
##
## smoothness_se
## Y [,1] [,2]
## B 0.007195921 0.003064917
## M 0.006780094 0.002890430
##
## compactness_se
## Y [,1] [,2]
## B 0.02148538 0.01635023
## M 0.03228117 0.01838719
##
## concavity_se
## Y [,1] [,2]
## B 0.02606976 0.03293560
## M 0.04182401 0.02160343
##
## concave points_se
## Y [,1] [,2]
## B 0.009885343 0.005692600
## M 0.015060472 0.005517362
##
## symmetry_se
## Y [,1] [,2]
## B 0.02056646 0.007000697
## M 0.02047240 0.010064888
##
## fractal_dimension_se
## Y [,1] [,2]
## B 0.003638447 0.002942005
## M 0.004062406 0.002041498
##
## radius_worst
## Y [,1] [,2]
## B 13.39082 1.973166
## M 21.13481 4.283569
##
## texture_worst
## Y [,1] [,2]
## B 23.49581 5.489610
## M 29.31821 5.434804
##
## perimeter_worst
## Y [,1] [,2]
## B 87.08416 13.46504
## M 141.37033 29.45706
##
## area_worst
## Y [,1] [,2]
## B 559.7149 163.1035
## M 1422.2863 597.9677
##
## smoothness_worst
## Y [,1] [,2]
## B 0.1250578 0.01995512
## M 0.1448452 0.02186983
##
## compactness_worst
## Y [,1] [,2]
## B 0.1830047 0.09209558
## M 0.3748241 0.17037198
##
## concavity_worst
## Y [,1] [,2]
## B 0.1667047 0.1402874
## M 0.4506056 0.1815067
##
## concave points_worst
## Y [,1] [,2]
## B 0.07465346 0.03562873
## M 0.18223731 0.04630779
##
## symmetry_worst
## Y [,1] [,2]
## B 0.2701986 0.04179392
## M 0.3234679 0.07468496
##
## fractal_dimension_worst
## Y [,1] [,2]
## B 0.07946750 0.01381510
## M 0.09152995 0.02155289
##
## ...33
## Y FALSE TRUE
## B
## M
Predict the diagnosis for each observation in the breast cancer dataset
# Append the model's predicted class for every row. The predictions are made
# on the same data the model was fitted on.
breast_cancer_data <- mutate(
  breast_cancer_data,
  predicted_diagnosis = predict(breast_cancer_mod, newdata = breast_cancer_data)
)
Create the confusion matrix
# Cross-tabulate actual vs. predicted diagnosis, then show row-wise
# percentages (two decimals) with the raw counts appended in parentheses.
breast_cancer_data %>%
  tabyl(var1 = diagnosis, var2 = predicted_diagnosis) %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()
## diagnosis B M
## B 96.07% (342) 3.93% (14)
## M 10.38% (22) 89.62% (190)
# Empirical density of `area_mean`, one semi-transparent curve per diagnosis.
area_mean_density <- ggplot(
  data = breast_cancer_data,
  mapping = aes(x = area_mean, fill = diagnosis)
) +
  geom_density(alpha = 0.7) +
  scale_fill_viridis_d(option = "magma") +
  theme_minimal()

# Render the static plot as an interactive plotly widget.
ggplotly(area_mean_density)
# Mean / standard deviation of the first normal curve. The values equal the
# `area_mean` row for class M in the fitted model's printed output.
mean1 <- 978.3764
sd1 <- 367.9380

# Mean / standard deviation of the second normal curve. The values equal the
# `area_mean` row for class B in the fitted model's printed output.
mean2 <- 463.5817
sd2 <- 133.6395

# Evaluation grid: 1000 points spanning four standard deviations either side
# of both means.
x <- seq(
  from = min(c(mean1, mean2) - 4 * c(sd1, sd2)),
  to = max(c(mean1, mean2) + 4 * c(sd1, sd2)),
  length.out = 1000
)

# Normal densities of the two curves on that grid.
y1 <- dnorm(x, mean = mean1, sd = sd1)
y2 <- dnorm(x, mean = mean2, sd = sd2)

# Collect everything in one data frame; densities are rounded to 5 decimal
# places so the interactive tooltips stay readable.
normal_data <- data.frame(
  x = x,
  y1 = round(y1, digits = 5),
  y2 = round(y2, digits = 5)
)
# Draw both assumed normal curves on one set of axes. The colour aesthetic is
# mapped to a constant label per curve so each gets a legend entry, and the
# manual scale assigns the actual colours.
normal_dist_plot <- ggplot(data = normal_data, mapping = aes(x = x)) +
  geom_line(mapping = aes(y = y1, color = "Distribution 1")) +
  geom_line(mapping = aes(y = y2, color = "Distribution 2")) +
  scale_color_manual(
    values = c("Distribution 1" = "gold", "Distribution 2" = "grey")
  ) +
  labs(
    title = "Assumed Normal Distributions",
    x = "area mean",
    y = "Density"
  ) +
  xlim(143, 2500) +
  theme_minimal()

# Interactive version; the tooltip is limited to the x and y values (the y
# values were already rounded to 5 decimal places in `normal_data`).
ggplotly(normal_dist_plot, tooltip = c("x", "y"))
# Append the raw class probabilities returned by the model. With
# `type = "raw"` the new column is a matrix with one column per class.
breast_cancer_data <- mutate(
  breast_cancer_data,
  predicted_diagnosis_raw = predict(
    breast_cancer_mod,
    newdata = breast_cancer_data,
    type = "raw"
  )
)

# Histogram of the predicted probability of class "B", coloured by the actual
# diagnosis.
prediction_hist <- ggplot(
  data = breast_cancer_data,
  mapping = aes(x = predicted_diagnosis_raw[, "B"], fill = diagnosis)
) +
  geom_histogram(bins = 100) +
  scale_fill_viridis_d() +
  theme_minimal()

ggplotly(prediction_hist)
# Label every row with its confusion-matrix cell, treating "M" as the positive
# class. The four conditions are mutually exclusive; anything that matches
# none of them falls through to "Unknown".
breast_cancer_data <- mutate(
  breast_cancer_data,
  confusion_category = case_when(
    diagnosis == "B" & predicted_diagnosis == "B" ~ "True Negative",
    diagnosis == "B" & predicted_diagnosis == "M" ~ "False Positive",
    diagnosis == "M" & predicted_diagnosis == "B" ~ "False Negative",
    diagnosis == "M" & predicted_diagnosis == "M" ~ "True Positive",
    TRUE ~ "Unknown"
  )
)

# Count how many rows fall into each category.
table(breast_cancer_data[["confusion_category"]])
##
## False Negative False Positive True Negative True Positive
## 22 14 342 190
# Density of `smoothness_worst`, split by confusion-matrix category, to
# compare misclassified rows with correctly classified ones.
smoothness_worst_density <- ggplot(
  data = breast_cancer_data,
  mapping = aes(x = smoothness_worst, fill = confusion_category)
) +
  geom_density(alpha = 0.5) +
  scale_fill_viridis_d(option = "magma") +
  theme_minimal()

ggplotly(smoothness_worst_density)
# Density of `area_mean`, split by confusion-matrix category.
area_density_plot <- ggplot(
  data = breast_cancer_data,
  mapping = aes(x = area_mean, fill = confusion_category)
) +
  geom_density(alpha = 0.5) +
  scale_fill_viridis_d(option = "magma") +
  theme_minimal()

ggplotly(area_density_plot)